# Credits Analysis: Data Processing Functions
# Alex F. Gazmararian
# agazmararian@gmail.com

#' Extract last name from full name string
#'
#' Strips honorifics (Mr, Mrs, Ms, Dr, Prof, Senator, Rep, Representative)
#' and a trailing generational suffix (Jr, Sr, II, III, IV), then returns the
#' last whitespace-delimited token of what remains.
#'
#' @param name Character vector of full names
#' @return Character vector of last names, one per element of `name`.
#'   `NA` is returned for missing or empty input and for input that
#'   contains nothing but titles/suffixes.
extract_last_name <- function(name) {
  # The trailing \\b stops a title from matching the start of a real name
  # (e.g. "Dr" inside "Drake", "Rep" inside "Reppert").
  title_pattern <-
    "\\b(Mrs|Mr|Ms|Dr|Prof|Senator|Representative|Rep)\\b\\.?\\s*"
  # Optional comma so "Smith, Jr." does not leave "Smith," behind.
  suffix_pattern <- ",?\\s+(Jr|Sr|III|II|IV)\\.?$"

  purrr::map_chr(name, function(n) {
    if (is.na(n) || n == "") return(NA_character_)

    n <- stringr::str_replace_all(n, title_pattern, "")
    # Trim first: the suffix pattern is anchored at the end of the string,
    # so trailing whitespace would otherwise hide the suffix.
    n <- stringr::str_replace_all(stringr::str_trim(n), suffix_pattern, "")

    parts <- stringr::str_split(stringr::str_trim(n), "\\s+")[[1]]
    last <- parts[length(parts)]

    # Splitting an empty string yields a single "", not a zero-length vector,
    # so emptiness has to be checked on the token itself.
    if (is.na(last) || last == "") NA_character_ else last
  })
}

#' Clean district text by removing specified patterns
#'
#' Every match of any supplied pattern is deleted, then whitespace is
#' normalised (leading/trailing removed, internal runs collapsed to one space).
#'
#' @param district_text Character vector of district descriptions
#' @param ... Character patterns (regular expressions) to remove. `NA` and
#'   empty patterns are ignored; with no usable pattern only the whitespace
#'   normalisation is applied.
#' @return Character vector of cleaned district text, same length as
#'   `district_text`
clean_district_text <- function(district_text, ...) {
  patterns <- as.character(c(...))
  # paste0() would turn an NA pattern into the literal text "NA", and an
  # empty pattern contributes nothing useful to the alternation.
  patterns <- patterns[!is.na(patterns) & nzchar(patterns)]

  # With no usable patterns the collapsed regex would be the empty string,
  # which stringr does not accept as a search pattern, so removal is skipped.
  if (length(patterns) > 0) {
    district_text <- stringr::str_remove_all(
      district_text,
      paste0(patterns, collapse = "|")
    )
  }

  # str_squish() already trims both ends, so no separate str_trim() is needed.
  stringr::str_squish(district_text)
}

#' Validate raw input data
#'
#' Emits a short diagnostic report via `message()`: the data frame's
#' dimensions and, for every column containing `NA`, the number of missing
#' values. The data itself is not modified.
#'
#' @param data Data frame to validate
#' @param stage_name Character string describing the validation stage
#' @return `data`, invisibly, so the call can sit in the middle of a pipeline
validate_raw_data <- function(data, stage_name) {
  stopifnot(is.data.frame(data))

  message("=== Data Validation: ", stage_name, " ===")

  # Report dimensions
  message(sprintf("Rows: %d", nrow(data)))
  message(sprintf("Columns: %d", ncol(data)))

  # Count missing values column by column. Iterating over the columns
  # directly ignores any grouping on `data`, so there is always exactly one
  # count per column.
  na_counts <- vapply(
    data,
    function(column) sum(is.na(column)),
    integer(1)
  )

  flagged <- which(na_counts > 0)
  if (length(flagged) > 0) {
    message("Columns with missing values:")
    for (i in flagged) {
      message(sprintf("  %s: %d missing", names(data)[i], na_counts[[i]]))
    }
  } else {
    message("No missing values found")
  }

  invisible(data)
}

#' Load member data from XML nodes
#'
#' Reads the text of a fixed set of child elements from `xml_node` and
#' assembles them into a tibble. The XPath expressions and the resulting
#' columns depend on `type`.
#'
#' @param xml_node XML node to extract data from
#' @param type Type of member data ("house" or "senate")
#' @return Tibble with member information: `last_name`, `first_name`,
#'   `state`, `district`, `party` for "house"; `first_name`, `last_name`,
#'   `party`, `state` for "senate"
load_member_data <- function(xml_node, type = c("house", "senate")) {
  type <- match.arg(type)

  # Text of the first node matching `xpath`, searched from `xml_node`
  text_at <- function(xpath) {
    xml2::xml_text(xml2::xml_find_first(xml_node, xpath))
  }

  switch(type,
    house = tibble::tibble(
      last_name = text_at(".//member-info/lastname"),
      first_name = text_at(".//member-info/firstname"),
      state = text_at(".//member-info/state"),
      district = text_at(".//member-info/district"),
      party = text_at(".//member-info/party")
    ),
    senate = tibble::tibble(
      first_name = text_at("first_name"),
      last_name = text_at("last_name"),
      party = text_at("party"),
      state = text_at("state")
    )
  )
}

#' Standardize party names
#'
#' Matching is case-insensitive. The built-in rules are tried first, in the
#' order Democrat, Republican, Independent; text matching none of them is
#' labelled "Other" if it matches one of the extra keywords and "Unknown"
#' otherwise (including `NA` input).
#'
#' @param party_text Character vector of party names
#' @param ... Additional party keywords to recognize (regular expressions).
#'   `NA` and empty keywords are ignored.
#' @return Character vector of standardized party names: "Democrat",
#'   "Republican", "Independent", "Other" or "Unknown"
detect_party <- function(party_text, ...) {
  party_lower <- stringr::str_to_lower(party_text)

  party_keywords <- as.character(c(...))
  party_keywords <- party_keywords[
    !is.na(party_keywords) & nzchar(party_keywords)
  ]

  # Only build a pattern when there is something to match: collapsing zero
  # keywords yields "", which is not a usable search pattern. The keywords
  # are matched case-insensitively because the text has been lower-cased.
  matches_keyword <- if (length(party_keywords) > 0) {
    stringr::str_detect(
      party_lower,
      stringr::regex(
        paste(party_keywords, collapse = "|"),
        ignore_case = TRUE
      )
    )
  } else {
    rep(FALSE, length(party_lower))
  }

  dplyr::case_when(
    stringr::str_detect(party_lower, "democrat|dem\\b") ~ "Democrat",
    stringr::str_detect(party_lower, "republican|rep\\b|gop") ~ "Republican",
    stringr::str_detect(party_lower, "independent|ind\\b") ~ "Independent",
    matches_keyword ~ "Other",
    TRUE ~ "Unknown"
  )
}
